import os
import pickle
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
# Load the raw transaction data and take a first look at its contents.
transaction_data = pd.read_csv("data/takehome_ds_written.csv")
print(transaction_data.head(10))
print("\nData Length: ", len(transaction_data))
# "time" is still a string at this point; ISO-8601 timestamps sort
# lexicographically, so min/max still give the true date range.
print("\nStart Date: ", min(transaction_data["time"]))
print("\nEnd Date: ", max(transaction_data["time"]))
print("\nUnique Merchants", len(transaction_data["merchant"].unique()))
Unnamed: 0 merchant time amount_usd_in_cents 0 1 faa029c6b0 2034-06-17 23:34:14 6349 1 2 ed7a7d91aa 2034-12-27 00:40:38 3854 2 3 5608f200cf 2034-04-30 01:29:42 789 3 4 15b1a0d61e 2034-09-16 01:06:23 4452 4 5 4770051790 2034-07-22 16:21:42 20203 5 6 3fc4f08d9d 2034-08-12 10:54:48 6190 6 7 1c8b539073 2033-03-10 18:46:49 715 7 8 77f961f9b3 2034-12-09 21:27:18 7211 8 9 9cb79b7700 2034-03-27 23:01:13 23267 9 10 2f95149dc3 2034-01-24 21:17:22 2572 Data Length: 1513719 Start Date: 2033-01-01 09:38:32 End Date: 2034-12-31 07:59:40 Unique Merchants 14351
# Check every column for missing values, then drop the CSV's auto-generated
# index column, which carries no information.
print(transaction_data.isnull().sum())
transaction_data.drop(columns='Unnamed: 0', inplace=True)
Unnamed: 0 0 merchant 0 time 0 amount_usd_in_cents 0 dtype: int64
New features are extracted from the time column.
The amount in cents is converted to dollars for readability.
# Parse the timestamp once, then derive calendar features from it.
transaction_data["time"] = pd.to_datetime(transaction_data["time"])
timestamps = transaction_data["time"].dt
transaction_data["date"] = timestamps.date
transaction_data["month"] = timestamps.month
transaction_data["week_day"] = timestamps.weekday
transaction_data["hour"] = timestamps.hour
transaction_data["quarter"] = timestamps.quarter
transaction_data["year"] = timestamps.year
# Cents -> dollars for readability.
transaction_data["amount"] = transaction_data["amount_usd_in_cents"] / 100
# Total dollars transacted per calendar day, plotted as a time series with
# interactive range controls (quick-select buttons plus a draggable slider).
daily_trend = transaction_data.groupby(["date"], as_index=False)["amount"].sum()
fig1 = px.line(
    daily_trend,
    x='date',
    y='amount',
    title='Trend of Transaction Amount across Merchants',
    labels={"date": "Date", "amount": "Amount (USD)"},
)
range_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]
fig1.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(buttons=range_buttons),
)
# Number of transactions per day (week_day is implied by date, so grouping on
# both leaves one row per day), plotted with the same range controls.
daily_trend_count = (
    transaction_data
    .groupby(["date", "week_day"], as_index=False)["amount"]
    .count()
    .rename(columns={'amount': 'num_transaction'})
)
fig1 = px.line(
    daily_trend_count,
    x='date',
    y='num_transaction',
    title='Trend of Number of Transaction',
    labels={"date": "Date", "num_transaction": "Number of Transactions"},
)
selector_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]
fig1.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(buttons=selector_buttons),
)
The transactions are filtered one merchant at a time and stored in separate pickle files in the local folder.
# One-time preprocessing, kept for reference: each merchant's transactions
# were filtered from the main frame and pickled to a separate file so that
# later per-merchant passes can load one small file instead of re-filtering
# the full 1.5M-row frame. Re-run only to rebuild data/merchant_breakup/.
#for merchant in tqdm(merchant_type["merchant"]):
# data_filtered = transaction_data[transaction_data["merchant"] == merchant]
# file = open('data/merchant_breakup/'+merchant, 'wb')
# pickle.dump(data_filtered, file)
# file.close()
# Every file name under the folder is a merchant id.
merchant_list = os.listdir("data/merchant_breakup/")
We are interested in understanding payments activity to infer the types of merchants using Stripe, and generate type assignments for each merchant.
# Accumulator for per-merchant features; every later section appends one
# column-list aligned index-by-index with merchant_list.
merchant_type = {"merchant": merchant_list}
Assumption:
Day store timing: 6 AM to 6 PM
• All-day stores might be online merchants, gas stations or medical stores
• Day-only stores might be vegetable vendors, breakfast places, etc.
• Night-only stores might be dinner restaurants, bars, etc.
# Tag each transaction as "day" (06:00-17:59) or "night", then classify each
# merchant by the set of periods it transacts in.
hour = transaction_data["hour"]
transaction_data["day_store"] = np.where((hour >= 6) & (hour < 18),
                                         "day", "night")
# Merchants that appear with both tags operate around the clock.
unique_merch_time = transaction_data.drop_duplicates(["merchant", "day_store"])[["merchant", "day_store"]]
grouped_data = unique_merch_time.groupby(["merchant"], as_index=False).count()
allday_shop = set(grouped_data[grouped_data["day_store"] == 2]["merchant"])
# Last-seen tag per merchant (merchants outside allday_shop only ever carry
# one tag, so any occurrence works), overridden for round-the-clock shops.
timing_dict = dict(zip(transaction_data["merchant"], transaction_data["day_store"]))
for key in timing_dict:
    if key in allday_shop:
        timing_dict[key] = "all_day"
store_time = [timing_dict[merchant] for merchant in merchant_list]
merchant_type["store_time"] = store_time
# Distribution of shops based on operation time.
for label in ("all_day", "night", "day"):
    print(merchant_type["store_time"].count(label))
10677 2655 1019
An idea of how large individual transactions typically are.
Electronics shops would normally have bigger amount values than grocery stores.
# Per-merchant median transaction amount — a proxy for merchant category
# (electronics stores should have larger typical tickets than grocers).
median_sales = []
for merchant in tqdm(merchant_list):
    # Context manager guarantees the handle is closed even if pickle.load
    # raises; the original open/close pair leaked the handle on error.
    with open("data/merchant_breakup/" + merchant, 'rb') as file:
        data_filtered = pickle.load(file)
    median_sales.append(data_filtered["amount"].median())
merchant_type["median_sales"] = median_sales
# Dynamic thresholding: quartiles of the per-merchant median sale amount
# split merchants into four spend tiers.
low_spend_threshold, mediumLow_spend_threshold, mediumHigh_spend_threshold = (
    np.percentile(merchant_type["median_sales"], q) for q in (25, 50, 75)
)

def _spend_tier(value):
    # Map one median-sale value onto its quartile bucket.
    if value < low_spend_threshold:
        return "low"
    if value < mediumLow_spend_threshold:
        return "mediumLow"
    if value < mediumHigh_spend_threshold:
        return "mediumHigh"
    return "high"

merchant_sales_profile = [_spend_tier(value) for value in merchant_type["median_sales"]]
merchant_type["sales_transaction_profile"] = merchant_sales_profile
How many transactions each merchant made over the 2 years.
Electronics shops would normally have lesser number of transactions than grocery stores.
# Total number of transactions per merchant over the two-year window
# (high-ticket merchants typically transact less often than e.g. grocers).
num_transactions = []
for merchant in tqdm(merchant_list):
    # `with` closes the handle even if unpickling fails (original leaked it).
    with open("data/merchant_breakup/" + merchant, 'rb') as file:
        data_filtered = pickle.load(file)
    num_transactions.append(len(data_filtered))
merchant_type["num_transactions"] = num_transactions
# Dynamic thresholding: quartiles of the per-merchant transaction count
# split merchants into four volume tiers.
low_threshold, mediumLow_threshold, mediumHigh_threshold = (
    np.percentile(merchant_type["num_transactions"], q) for q in (25, 50, 75)
)

def _volume_tier(count):
    # Map one transaction count onto its quartile bucket.
    if count < low_threshold:
        return "low"
    if count < mediumLow_threshold:
        return "mediumLow"
    if count < mediumHigh_threshold:
        return "mediumHigh"
    return "high"

merchant_trans = [_volume_tier(count) for count in merchant_type["num_transactions"]]
merchant_type["sales_transaction_count"] = merchant_trans
Some businesses do very high sales during summer, and others during the holiday season; we also compare weekend vs. weekday sales.
a) Number of transactions in the quarter.
# (a) Number of transactions per quarter for each merchant — captures
# seasonality in activity (e.g. summer- or holiday-heavy businesses).
q1 = []
q2 = []
q3 = []
q4 = []
for merchant in tqdm(merchant_list):
    # `with` closes the handle even if unpickling fails (original leaked it).
    with open("data/merchant_breakup/" + merchant, 'rb') as file:
        data_filtered = pickle.load(file)
    # One value_counts pass replaces four separate filtered scans.
    quarter_counts = data_filtered["quarter"].value_counts()
    q1.append(int(quarter_counts.get(1, 0)))
    q2.append(int(quarter_counts.get(2, 0)))
    q3.append(int(quarter_counts.get(3, 0)))
    q4.append(int(quarter_counts.get(4, 0)))
merchant_type["q1_transactions"] = q1
merchant_type["q2_transactions"] = q2
merchant_type["q3_transactions"] = q3
merchant_type["q4_transactions"] = q4
b) Amount of the transactions in the quarter
# (b) Total transaction amount per quarter for each merchant.
q1 = []
q2 = []
q3 = []
q4 = []
for merchant in tqdm(merchant_list):
    # `with` closes the handle even if unpickling fails (original leaked it).
    with open("data/merchant_breakup/" + merchant, 'rb') as file:
        data_filtered = pickle.load(file)
    # One grouped sum replaces four separate filtered passes; missing
    # quarters default to 0.0, matching sum() over an empty selection.
    quarter_sales = data_filtered.groupby("quarter")["amount"].sum()
    q1.append(float(quarter_sales.get(1, 0.0)))
    q2.append(float(quarter_sales.get(2, 0.0)))
    q3.append(float(quarter_sales.get(3, 0.0)))
    q4.append(float(quarter_sales.get(4, 0.0)))
merchant_type["q1_sales"] = q1
merchant_type["q2_sales"] = q2
merchant_type["q3_sales"] = q3
merchant_type["q4_sales"] = q4
c) Weekend sales and Weekday sales average
# (c) Average transaction amount on weekdays vs weekends per merchant.
avg_weekday = []
avg_weekend = []
for merchant in tqdm(merchant_list):
    # `with` closes the handle even if unpickling fails (original leaked it).
    with open("data/merchant_breakup/" + merchant, 'rb') as file:
        data_filtered = pickle.load(file)
    # week_day is 0-6 (Mon-Sun), so ~weekend selects exactly Mon-Fri.
    is_weekend = data_filtered["week_day"].isin([5, 6])
    # Series.mean() returns NaN on an empty selection without the
    # "mean of empty slice" RuntimeWarning that np.mean emits;
    # the NaNs are filled with 0 when the frame is assembled below.
    avg_weekday.append(data_filtered.loc[~is_weekend, "amount"].mean())
    avg_weekend.append(data_filtered.loc[is_weekend, "amount"].mean())
merchant_type["avg_weekday"] = avg_weekday
merchant_type["avg_weekend"] = avg_weekend
# Assemble the per-merchant feature table; merchants with no activity in a
# slice produced NaN averages, which are treated as zero.
merchant_attributes = pd.DataFrame(merchant_type).fillna(0)
merchant_attributes.head()
| merchant | store_time | median_sales | sales_transaction_profile | num_transactions | sales_transaction_count | q1_transactions | q2_transactions | q3_transactions | q4_transactions | q1_sales | q2_sales | q3_sales | q4_sales | avg_weekday | avg_weekend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ef1932584c | all_day | 43.00 | low | 7 | mediumLow | 0 | 0 | 7 | 0 | 0.00 | 0.00 | 700.07 | 0.00 | 122.854000 | 42.90 |
| 1 | 26afdd587f | night | 237.21 | high | 5 | mediumLow | 0 | 1 | 4 | 0 | 0.00 | 162.99 | 992.04 | 0.00 | 257.196667 | 191.72 |
| 2 | 40e34cf99b | all_day | 15.77 | low | 3 | mediumLow | 0 | 1 | 2 | 0 | 0.00 | 16.28 | 29.90 | 0.00 | 15.393333 | 0.00 |
| 3 | 49054f472d | all_day | 92.20 | mediumHigh | 3 | mediumLow | 0 | 0 | 0 | 3 | 0.00 | 0.00 | 0.00 | 250.98 | 83.660000 | 0.00 |
| 4 | eacdd1a861 | day | 108.06 | mediumHigh | 2 | low | 1 | 1 | 0 | 0 | 23.05 | 193.07 | 0.00 | 0.00 | 0.000000 | 108.06 |
# Order transactions chronologically and label each with its "Qn, YYYY"
# quarter; a Categorical with an explicit category list makes the quarters
# sort in calendar order rather than alphabetically.
transaction_data = transaction_data.sort_values(["time"])
transaction_data["quarter_year"] = (
    "Q" + transaction_data["quarter"].astype(str) + ", " + transaction_data["year"].astype(str)
)
quarter_order = [f"Q{q}, {y}" for y in (2033, 2034) for q in (1, 2, 3, 4)]
transaction_data['quarter_year'] = pd.Categorical(transaction_data['quarter_year'], quarter_order)
# Set of merchants active in each quarter.
churn_data = pd.DataFrame(transaction_data.groupby(["quarter_year"])['merchant'].apply(set))
churn_data.reset_index(inplace=True)
churn_data
| quarter_year | merchant | |
|---|---|---|
| 0 | Q1, 2033 | {309de2d8a2, b4349f967a, 1d5e86fa15, 32c922fbc... |
| 1 | Q2, 2033 | {4525913c4b, 4ffcef37b0, d7d4ccaebe, 9212a6d43... |
| 2 | Q3, 2033 | {357569f603, 4525913c4b, 5c321f9221, 550da2620... |
| 3 | Q4, 2033 | {994da28b10, 357569f603, 4525913c4b, d7d4ccaeb... |
| 4 | Q1, 2034 | {4ffcef37b0, d7d4ccaebe, 3f6a4a3717, b4349f967... |
| 5 | Q2, 2034 | {89bac4c2cb, d7d4ccaebe, 5a3dc18cb6, 57e4ccf9c... |
| 6 | Q3, 2034 | {4ffcef37b0, 89bac4c2cb, d7d4ccaebe, 5a3dc18cb... |
| 7 | Q4, 2034 | {4ffcef37b0, 5a3dc18cb6, 57e4ccf9c8, b4349f967... |
# Quarter-over-quarter churn: compare each quarter's merchant set with the
# previous quarter's.
#   merchant_diff = merchants present last quarter but gone this quarter
#   new_merchant  = merchants present this quarter but not last quarter
churn_data["merchant_shift"] = churn_data["merchant"].shift(1)
churn_data["merchant_diff"] = churn_data["merchant_shift"] - churn_data["merchant"]
churn_data["new_merchant"] = churn_data["merchant"] - churn_data["merchant_shift"]
# The first row has no previous quarter, so fill its NaNs with an empty
# iterable (len("") == 0). Assigning the result back replaces the chained
# `Series.fillna(..., inplace=True)` pattern, which is deprecated and no
# longer updates the parent frame in recent pandas versions.
for col in ("merchant_shift", "merchant_diff", "new_merchant"):
    churn_data[col] = churn_data[col].fillna("")
    churn_data[col + "_len"] = churn_data[col].apply(len)
# Churn rate = share of last quarter's merchants that did not return
# (NaN for the first row, where the denominator is 0).
churn_data["churn_rate"] = round(churn_data["merchant_diff_len"] / churn_data["merchant_shift_len"] * 100, 2)
churn_data
| quarter_year | merchant | merchant_shift | merchant_diff | new_merchant | merchant_shift_len | merchant_diff_len | new_merchant_len | churn_rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Q1, 2033 | {309de2d8a2, b4349f967a, 1d5e86fa15, 32c922fbc... | 0 | 0 | 0 | NaN | |||
| 1 | Q2, 2033 | {4525913c4b, 4ffcef37b0, d7d4ccaebe, 9212a6d43... | {309de2d8a2, b4349f967a, 1d5e86fa15, 32c922fbc... | {27c63c3014, c171e7bbca, a4ad0f33d7, 68a8e9f36... | {4525913c4b, 4ffcef37b0, d7d4ccaebe, 9212a6d43... | 1316 | 441 | 1546 | 33.51 |
| 2 | Q3, 2033 | {357569f603, 4525913c4b, 5c321f9221, 550da2620... | {4525913c4b, 4ffcef37b0, d7d4ccaebe, 9212a6d43... | {4ffcef37b0, 6455e45d3d, 9ef640d727, fbbe30887... | {357569f603, 550da26208, 5c321f9221, 10f3baa29... | 2421 | 815 | 1797 | 33.66 |
| 3 | Q4, 2033 | {994da28b10, 357569f603, 4525913c4b, d7d4ccaeb... | {357569f603, 4525913c4b, 5c321f9221, 550da2620... | {c171e7bbca, 550da26208, 5c321f9221, 96764b7a1... | {994da28b10, ca16a3400a, 791688c2b5, 4aeaf1542... | 3403 | 1017 | 1987 | 29.89 |
| 4 | Q1, 2034 | {4ffcef37b0, d7d4ccaebe, 3f6a4a3717, b4349f967... | {994da28b10, 357569f603, 4525913c4b, d7d4ccaeb... | {994da28b10, ca16a3400a, 402b000e78, b38c305d4... | {a98ddbbbcf, 4ffcef37b0, 3f6a4a3717, d443bf6bf... | 4373 | 1250 | 2248 | 28.58 |
| 5 | Q2, 2034 | {89bac4c2cb, d7d4ccaebe, 5a3dc18cb6, 57e4ccf9c... | {4ffcef37b0, d7d4ccaebe, 3f6a4a3717, b4349f967... | {4ffcef37b0, 3f6a4a3717, 0af965e599, a07358569... | {462d50a6c8, c8c48c45e2, 89bac4c2cb, 5a3dc18cb... | 5371 | 1343 | 2471 | 25.00 |
| 6 | Q3, 2034 | {4ffcef37b0, 89bac4c2cb, d7d4ccaebe, 5a3dc18cb... | {89bac4c2cb, d7d4ccaebe, 5a3dc18cb6, 57e4ccf9c... | {a98ddbbbcf, 9212a6d430, 7d28bd59e5, 85d883a2c... | {4ffcef37b0, 14fb37c54a, 402b000e78, 48e03d323... | 6499 | 1705 | 2544 | 26.23 |
| 7 | Q4, 2034 | {4ffcef37b0, 5a3dc18cb6, 57e4ccf9c8, b4349f967... | {4ffcef37b0, 89bac4c2cb, d7d4ccaebe, 5a3dc18cb... | {89bac4c2cb, 14fb37c54a, d7d4ccaebe, d443bf6bf... | {a98ddbbbcf, b5184dc979, 1de8f93cb8, 1dd9c357d... | 7338 | 1784 | 2952 | 24.31 |
# Collect every merchant that churned in at least one quarter (the first
# row's "" placeholder contributes nothing to the union).
churn_merchant = set()
for dropped in churn_data["merchant_diff"]:
    churn_merchant.update(dropped)
len(set(churn_merchant))
# Dual-axis chart: bars = new merchants per quarter (left axis),
# line = churn rate % (right axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x=churn_data["quarter_year"], y=churn_data["churn_rate"], name="Churn Rate"), secondary_y=True,
)
fig.add_trace(
    go.Bar(
        x=churn_data["quarter_year"],
        y=churn_data["new_merchant_len"],
        name="New Merchants Count",
        textposition="auto",
    )
)
# Fixed typo in the displayed title: "Quaterly" -> "Quarterly".
fig.update_layout(
    title_text="Churn Rate and New Merchant Addition Quarterly"
)
fig.update_xaxes(title_text="Yearly Quarters")
fig.update_yaxes(title_text="<b>New Merchant Count</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Churn Rate %</b>", secondary_y=True)
fig.show()
# Same preparation as the quarterly analysis, at monthly granularity: label
# each transaction "m, YYYY" and impose calendar order with a Categorical.
transaction_data = transaction_data.sort_values(["time"])
transaction_data["month_year"] = (
    transaction_data["month"].astype(str) + ", " + transaction_data["year"].astype(str)
)
month_order = [f"{m}, {y}" for y in (2033, 2034) for m in range(1, 13)]
transaction_data['month_year'] = pd.Categorical(transaction_data['month_year'], month_order)
transaction_data = transaction_data.sort_values("month_year")
transaction_data.head()
| merchant | time | amount_usd_in_cents | date | month | week_day | hour | quarter | year | amount | day_store | quarter_year | month_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 731681 | 3e72388b82 | 2033-01-01 09:38:32 | 4335 | 2033-01-01 | 1 | 5 | 9 | 1 | 2033 | 43.35 | day | Q1, 2033 | 1, 2033 |
| 165023 | 654930c922 | 2033-01-12 14:34:40 | 4320 | 2033-01-12 | 1 | 2 | 14 | 1 | 2033 | 43.20 | day | Q1, 2033 | 1, 2033 |
| 1021324 | 654930c922 | 2033-01-12 14:34:58 | 12137 | 2033-01-12 | 1 | 2 | 14 | 1 | 2033 | 121.37 | day | Q1, 2033 | 1, 2033 |
| 147741 | 654930c922 | 2033-01-12 14:35:02 | 4979 | 2033-01-12 | 1 | 2 | 14 | 1 | 2033 | 49.79 | day | Q1, 2033 | 1, 2033 |
| 620394 | 654930c922 | 2033-01-12 14:35:02 | 74263 | 2033-01-12 | 1 | 2 | 14 | 1 | 2033 | 742.63 | day | Q1, 2033 | 1, 2033 |
# Set of merchants active in each month.
monthly_sets = transaction_data.groupby(["month_year"])['merchant'].apply(set)
churn_data = pd.DataFrame(monthly_sets)
churn_data.reset_index(inplace=True)
churn_data.head()
| month_year | merchant | |
|---|---|---|
| 0 | 1, 2033 | {309de2d8a2, bfa1275188, 1d5e86fa15, b09064cab... |
| 1 | 2, 2033 | {27c63c3014, 309de2d8a2, bfa1275188, 42a6de2ea... |
| 2 | 3, 2033 | {c171e7bbca, 99fd90aae8, 6455e45d3d, a4ad0f33d... |
| 3 | 4, 2033 | {4ffcef37b0, 6e2d3e8abd, 6455e45d3d, d7d4ccaeb... |
| 4 | 5, 2033 | {4525913c4b, 9212a6d430, 309de2d8a2, b4349f967... |
# Month-over-month churn: compare each month's merchant set against the
# previous month's.
#   merchant_diff = merchants present last month but gone this month
#   new_merchant  = merchants present this month but not last month
churn_data["merchant_shift"] = churn_data["merchant"].shift(1)
churn_data["merchant_diff"] = churn_data["merchant_shift"] - churn_data["merchant"]
churn_data["new_merchant"] = churn_data["merchant"] - churn_data["merchant_shift"]
# The first row has no previous month, so fill its NaNs with an empty
# iterable (len("") == 0). Assigning the result back replaces the chained
# `Series.fillna(..., inplace=True)` pattern, which is deprecated and no
# longer updates the parent frame in recent pandas versions.
for col in ("merchant_shift", "merchant_diff", "new_merchant"):
    churn_data[col] = churn_data[col].fillna("")
    churn_data[col + "_len"] = churn_data[col].apply(len)
# Churn rate = share of last month's merchants that did not return
# (NaN for the first row, where the denominator is 0).
churn_data["churn_rate"] = round(churn_data["merchant_diff_len"] / churn_data["merchant_shift_len"] * 100, 2)
churn_data
| month_year | merchant | merchant_shift | merchant_diff | new_merchant | merchant_shift_len | merchant_diff_len | new_merchant_len | churn_rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1, 2033 | {309de2d8a2, bfa1275188, 1d5e86fa15, b09064cab... | 0 | 0 | 0 | NaN | |||
| 1 | 2, 2033 | {27c63c3014, 309de2d8a2, bfa1275188, 42a6de2ea... | {309de2d8a2, bfa1275188, 1d5e86fa15, b09064cab... | {205577ab13, 7162acccfe, 7c673a8103, 3547b3b3b... | {27c63c3014, 42a6de2ea7, 32c922fbc6, 5d432730d... | 390 | 155 | 421 | 39.74 |
| 2 | 3, 2033 | {c171e7bbca, 99fd90aae8, 6455e45d3d, a4ad0f33d... | {27c63c3014, 309de2d8a2, bfa1275188, 42a6de2ea... | {27c63c3014, e9da792e9c, 7be67b18a7, ddb2d780f... | {c171e7bbca, 99fd90aae8, 6455e45d3d, 68a8e9f36... | 656 | 235 | 543 | 35.82 |
| 3 | 4, 2033 | {4ffcef37b0, 6e2d3e8abd, 6455e45d3d, d7d4ccaeb... | {c171e7bbca, 99fd90aae8, 6455e45d3d, a4ad0f33d... | {c171e7bbca, 99fd90aae8, a4ad0f33d7, 68a8e9f36... | {4ffcef37b0, 6e2d3e8abd, 9ef640d727, d7d4ccaeb... | 964 | 334 | 534 | 34.65 |
| 4 | 5, 2033 | {4525913c4b, 9212a6d430, 309de2d8a2, b4349f967... | {4ffcef37b0, 6e2d3e8abd, 6455e45d3d, d7d4ccaeb... | {4ffcef37b0, 6e2d3e8abd, 6455e45d3d, d7d4ccaeb... | {4525913c4b, 96764b7a17, 99fd90aae8, e76e42261... | 1164 | 398 | 636 | 34.19 |
| 5 | 6, 2033 | {309de2d8a2, b4349f967a, ab4be788a4, cd7a1fb3b... | {4525913c4b, 9212a6d430, 309de2d8a2, b4349f967... | {4525913c4b, 96764b7a17, 99fd90aae8, 9212a6d43... | {fbbe308873, fb30ba7dee, 540fedbb11, 287517b02... | 1402 | 480 | 717 | 34.24 |
| 6 | 7, 2033 | {357569f603, 4525913c4b, 5c321f9221, d7d4ccaeb... | {309de2d8a2, b4349f967a, ab4be788a4, cd7a1fb3b... | {9ef640d727, fbbe308873, 581afa8277, 9017e5e44... | {357569f603, 4525913c4b, 5c321f9221, 96764b7a1... | 1639 | 511 | 736 | 31.18 |
| 7 | 8, 2033 | {4525913c4b, 357569f603, 550da26208, 309de2d8a... | {357569f603, 4525913c4b, 5c321f9221, d7d4ccaeb... | {c171e7bbca, 5c321f9221, a47bba4c18, d7d4ccaeb... | {550da26208, 78ec3c097f, e92cdbdb4f, 0af965e59... | 1864 | 561 | 846 | 30.10 |
| 8 | 9, 2033 | {357569f603, 4525913c4b, 10f3baa29f, 309de2d8a... | {4525913c4b, 357569f603, 550da26208, 309de2d8a... | {550da26208, 99fd90aae8, 78ec3c097f, 85d883a2c... | {ded17864fa, 6e2d3e8abd, a47bba4c18, 75ffb6e26... | 2149 | 669 | 844 | 31.13 |
| 9 | 10, 2033 | {357569f603, 4525913c4b, d7d4ccaebe, b4349f967... | {357569f603, 4525913c4b, 10f3baa29f, 309de2d8a... | {96764b7a17, 6e2d3e8abd, e76e422617, 309de2d8a... | {3ee1c93bf7, 78ec3c097f, d7d4ccaebe, 9ef640d72... | 2324 | 662 | 900 | 28.49 |
| 10 | 11, 2033 | {994da28b10, 357569f603, 4525913c4b, d7d4ccaeb... | {357569f603, 4525913c4b, d7d4ccaebe, b4349f967... | {9f992fb6de, 9ef640d727, 287517b02c, b4349f967... | {994da28b10, e941d2a44e, c1f7c2ca72, 19db0e8d8... | 2562 | 771 | 994 | 30.09 |
| 11 | 12, 2033 | {994da28b10, 4525913c4b, 357569f603, 791688c2b... | {994da28b10, 357569f603, 4525913c4b, d7d4ccaeb... | {e941d2a44e, ded17864fa, 78ec3c097f, 6e2d3e8ab... | {3a7e7afa39, 9f992fb6de, 791688c2b5, 9d7e09fde... | 2785 | 729 | 1051 | 26.18 |
| 12 | 1, 2034 | {357569f603, 4525913c4b, d7d4ccaebe, 791688c2b... | {994da28b10, 4525913c4b, 357569f603, 791688c2b... | {994da28b10, 3a7e7afa39, 19db0e8d83, c1f7c2ca7... | {708dd98c6b, e941d2a44e, ded17864fa, d7d4ccaeb... | 3107 | 922 | 1042 | 29.67 |
| 13 | 2, 2034 | {a98ddbbbcf, 4525913c4b, 357569f603, 4ffcef37b... | {357569f603, 4525913c4b, d7d4ccaebe, 791688c2b... | {e941d2a44e, ded17864fa, a47bba4c18, 75ffb6e26... | {a98ddbbbcf, fa8a2c30d1, 4ffcef37b0, c1f7c2ca7... | 3227 | 847 | 1110 | 26.25 |
| 14 | 3, 2034 | {a98ddbbbcf, 357569f603, 4525913c4b, 791688c2b... | {a98ddbbbcf, 4525913c4b, 357569f603, 4ffcef37b... | {708dd98c6b, fa8a2c30d1, 4ffcef37b0, d7d4ccaeb... | {791688c2b5, 64d92c2b99, ea7cce3446, 01741912b... | 3490 | 852 | 1293 | 24.41 |
| 15 | 4, 2034 | {462d50a6c8, 357569f603, 4525913c4b, 791688c2b... | {a98ddbbbcf, 357569f603, 4525913c4b, 791688c2b... | {a98ddbbbcf, 75ffb6e266, 9ef640d727, 287517b02... | {462d50a6c8, 19db0e8d83, 3f78f8a93a, d6e40ea37... | 3931 | 1029 | 1177 | 26.18 |
| 16 | 5, 2034 | {a98ddbbbcf, 4525913c4b, 357569f603, d7d4ccaeb... | {462d50a6c8, 357569f603, 4525913c4b, 791688c2b... | {462d50a6c8, 59f062a3df, 19db0e8d83, 3f78f8a93... | {a98ddbbbcf, d7d4ccaebe, 432dc04805, 571ba7914... | 4079 | 995 | 1353 | 24.39 |
| 17 | 6, 2034 | {4525913c4b, 357569f603, 89bac4c2cb, c8c48c45e... | {a98ddbbbcf, 4525913c4b, 357569f603, d7d4ccaeb... | {a98ddbbbcf, fa8a2c30d1, 6e2d3e8abd, a532b4372... | {c8c48c45e2, 89bac4c2cb, 7035b85e82, 57e4ccf9c... | 4437 | 1170 | 1363 | 26.37 |
| 18 | 7, 2034 | {357569f603, 4525913c4b, c8c48c45e2, 89bac4c2c... | {4525913c4b, 357569f603, 89bac4c2cb, c8c48c45e... | {2ef0c8b74f, 59f062a3df, 99fd90aae8, 9b9c44388... | {14fb37c54a, 402b000e78, 432dc04805, 48e03d323... | 4630 | 1167 | 1326 | 25.21 |
| 19 | 8, 2034 | {4ffcef37b0, 5a3dc18cb6, 57e4ccf9c8, baeebf8f7... | {357569f603, 4525913c4b, c8c48c45e2, 89bac4c2c... | {0082de98e9, 89bac4c2cb, c8668b205a, 19db0e8d8... | {462d50a6c8, 4ffcef37b0, d443bf6bf7, 57e4ccf9c... | 4789 | 1093 | 1479 | 22.82 |
| 20 | 9, 2034 | {4ffcef37b0, d7d4ccaebe, 5a3dc18cb6, 57e4ccf9c... | {4ffcef37b0, 5a3dc18cb6, 57e4ccf9c8, baeebf8f7... | {c8c48c45e2, d443bf6bf7, 1d3569fe0d, acf291ff3... | {d7d4ccaebe, b4349f967a, 8d6e36b166, b335c06f5... | 5175 | 1279 | 1377 | 24.71 |
| 21 | 10, 2034 | {4ffcef37b0, 5a3dc18cb6, 57e4ccf9c8, b4349f967... | {4ffcef37b0, d7d4ccaebe, 5a3dc18cb6, 57e4ccf9c... | {708dd98c6b, 2ef0c8b74f, 59f062a3df, 0ef30be91... | {c8c48c45e2, b5184dc979, 1dd9c357d5, 309de2d8a... | 5273 | 1201 | 1668 | 22.78 |
| 22 | 11, 2034 | {4ffcef37b0, 5a3dc18cb6, 57e4ccf9c8, b4349f967... | {4ffcef37b0, 5a3dc18cb6, 57e4ccf9c8, b4349f967... | {c8c48c45e2, b5184dc979, 23eac36c6a, a2a052a62... | {a98ddbbbcf, 13b8e7039e, 70c45016c3, 90d1aa03a... | 5740 | 1404 | 1619 | 24.46 |
| 23 | 12, 2034 | {4ffcef37b0, 57e4ccf9c8, b4349f967a, baeebf8f7... | {4ffcef37b0, 5a3dc18cb6, 57e4ccf9c8, b4349f967... | {462d50a6c8, a98ddbbbcf, 5a3dc18cb6, 1dd9c357d... | {b5184dc979, 1de8f93cb8, 10f3baa29f, b38c305d4... | 5955 | 1401 | 1572 | 23.53 |
# Dual-axis chart: bars = new merchants per month (left axis),
# line = monthly churn rate % (right axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])
churn_line = go.Scatter(
    x=churn_data["month_year"],
    y=churn_data["churn_rate"],
    name="Churn Rate",
)
fig.add_trace(churn_line, secondary_y=True)
new_merchant_bars = go.Bar(
    x=churn_data["month_year"],
    y=churn_data["new_merchant_len"],
    name="New Merchants Count",
    textposition="auto",
)
fig.add_trace(new_merchant_bars)
fig.update_layout(title_text="Churn Rate and New Merchant Addition Monthly")
fig.update_xaxes(title_text="Month, Year")
fig.update_yaxes(title_text="<b>New Merchant Count</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Churn Rate %</b>", secondary_y=True)
fig.show()
# churn_merchant: every merchant that disappeared in at least one month
# (the first row's "" placeholder contributes nothing to the union).
churn_merchant = set()
churn_merchant.update(*churn_data["merchant_diff"])
len(set(churn_merchant))
Making use of the Merchant Attributes already created.
# Label merchants that churned at least once with churn=1, all others 0.
# Assigning the filled column back replaces the chained
# `Series.fillna(..., inplace=True)` pattern, which is deprecated and no
# longer updates the parent frame in recent pandas versions.
merchant_attributes.loc[merchant_attributes["merchant"].isin(churn_merchant), 'churn'] = 1
merchant_attributes['churn'] = merchant_attributes['churn'].fillna(0)
merchant_attributes.head()
| merchant | store_time | median_sales | sales_transaction_profile | num_transactions | sales_transaction_count | q1_transactions | q2_transactions | q3_transactions | q4_transactions | q1_sales | q2_sales | q3_sales | q4_sales | avg_weekday | avg_weekend | churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ef1932584c | all_day | 43.00 | low | 7 | mediumLow | 0 | 0 | 7 | 0 | 0.00 | 0.00 | 700.07 | 0.00 | 122.854000 | 42.90 | 1.0 |
| 1 | 26afdd587f | night | 237.21 | high | 5 | mediumLow | 0 | 1 | 4 | 0 | 0.00 | 162.99 | 992.04 | 0.00 | 257.196667 | 191.72 | 1.0 |
| 2 | 40e34cf99b | all_day | 15.77 | low | 3 | mediumLow | 0 | 1 | 2 | 0 | 0.00 | 16.28 | 29.90 | 0.00 | 15.393333 | 0.00 | 1.0 |
| 3 | 49054f472d | all_day | 92.20 | mediumHigh | 3 | mediumLow | 0 | 0 | 0 | 3 | 0.00 | 0.00 | 0.00 | 250.98 | 83.660000 | 0.00 | 0.0 |
| 4 | eacdd1a861 | day | 108.06 | mediumHigh | 2 | low | 1 | 1 | 0 | 0 | 23.05 | 193.07 | 0.00 | 0.00 | 0.000000 | 108.06 | 1.0 |
# Build class-balanced, DISJOINT train/test sets. The original code drew two
# independent .sample(3000) calls per class from the same pool, so most test
# rows also appeared in training (data leakage that inflates the reported
# metrics). Instead, shuffle each class once and slice non-overlapping
# halves, capping at 3000 rows per class per split as before.
churn_positive_data = merchant_attributes[merchant_attributes["churn"] == 1].sample(frac=1)
churn_negative_data = merchant_attributes[merchant_attributes["churn"] == 0].sample(frac=1)
n_per_class = min(3000, len(churn_positive_data) // 2, len(churn_negative_data) // 2)
train_data = pd.concat([churn_positive_data.iloc[:n_per_class],
                        churn_negative_data.iloc[:n_per_class]])
test_data = pd.concat([churn_positive_data.iloc[n_per_class:2 * n_per_class],
                       churn_negative_data.iloc[n_per_class:2 * n_per_class]])
# One-hot encode the categorical store_time feature for both splits.
train_data = pd.concat([train_data, pd.get_dummies(train_data["store_time"])], axis=1)
test_data = pd.concat([test_data, pd.get_dummies(test_data["store_time"])], axis=1)
# Model feature columns: the store_time dummies plus all numeric attributes.
cols = [
    "all_day", "day", "night", "median_sales", "num_transactions",
    "q1_transactions", "q2_transactions", "q3_transactions", "q4_transactions",
    "q1_sales", "q2_sales", "q3_sales", "q4_sales",
    "avg_weekday", "avg_weekend",
]
train_data[cols]
| all_day | day | night | median_sales | num_transactions | q1_transactions | q2_transactions | q3_transactions | q4_transactions | q1_sales | q2_sales | q3_sales | q4_sales | avg_weekday | avg_weekend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6650 | 0 | 1 | 0 | 406.940 | 1 | 0 | 0 | 1 | 0 | 0.00 | 0.00 | 406.94 | 0.00 | 406.940000 | 0.000000 |
| 10710 | 1 | 0 | 0 | 43.730 | 101 | 23 | 31 | 36 | 11 | 1172.76 | 1498.69 | 1941.57 | 604.48 | 51.754872 | 51.597742 |
| 5709 | 1 | 0 | 0 | 42.120 | 53 | 0 | 18 | 3 | 32 | 0.00 | 1342.55 | 131.31 | 2151.88 | 69.668163 | 53.000000 |
| 3210 | 1 | 0 | 0 | 102.720 | 3 | 0 | 3 | 0 | 0 | 0.00 | 586.85 | 0.00 | 0.00 | 195.616667 | 0.000000 |
| 2969 | 1 | 0 | 0 | 5821.280 | 13 | 2 | 2 | 6 | 3 | 12719.48 | 29834.39 | 58486.04 | 15118.07 | 8935.229231 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10527 | 1 | 0 | 0 | 101.040 | 5 | 0 | 0 | 0 | 5 | 0.00 | 0.00 | 0.00 | 902.60 | 216.730000 | 35.680000 |
| 7903 | 1 | 0 | 0 | 44.380 | 163 | 0 | 7 | 96 | 60 | 0.00 | 259.25 | 4774.39 | 4546.57 | 56.453672 | 67.261143 |
| 5532 | 1 | 0 | 0 | 72.340 | 2255 | 402 | 558 | 643 | 652 | 33384.27 | 44458.36 | 52378.19 | 53195.64 | 79.834581 | 84.076733 |
| 10318 | 1 | 0 | 0 | 42.585 | 134 | 28 | 21 | 28 | 57 | 1404.20 | 955.35 | 1550.65 | 4396.12 | 69.419022 | 45.708810 |
| 13601 | 1 | 0 | 0 | 75.705 | 398 | 61 | 75 | 118 | 144 | 5180.28 | 7740.95 | 9837.55 | 13273.03 | 94.062759 | 83.806058 |
6000 rows × 15 columns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Baseline Random Forest to see how well the churn label can be modeled
# from the merchant attributes. (Fixed typo: "Ramdom" -> "Random".)
# NOTE(review): no random_state is set, so the fit — and the printed report —
# varies run to run; n_estimators=10 is a deliberately small quick baseline
# (sklearn's default is 100).
clf=RandomForestClassifier(n_estimators=10)
clf.fit(train_data[cols], train_data["churn"])
y_pred=clf.predict(test_data[cols])
print(classification_report(test_data["churn"], y_pred))
precision recall f1-score support
0.0 0.85 0.98 0.91 3000
1.0 0.98 0.83 0.90 3000
accuracy 0.90 6000
macro avg 0.91 0.90 0.90 6000
weighted avg 0.91 0.90 0.90 6000
# Score currently-active merchants with the trained model to flag those at
# risk of churning. .copy() avoids SettingWithCopy issues when columns are
# added to the filtered slice below.
active_merchant_data = merchant_attributes[merchant_attributes["churn"] == 0].copy()
# reindex guarantees all three store_time dummy columns exist even if the
# active subset happens to lack a category (plain get_dummies would then
# make train_data[cols]-style selection below raise KeyError).
cat_encoding = pd.get_dummies(active_merchant_data["store_time"]).reindex(
    columns=["all_day", "day", "night"], fill_value=0
)
active_merchant_data = pd.concat([active_merchant_data, cat_encoding], axis=1)
y_pred = clf.predict(active_merchant_data[cols])
active_merchant_data["predicted_churn"] = y_pred
predicted_churn_merchant = active_merchant_data[active_merchant_data["predicted_churn"] == 1]["merchant"]
predicted_churn_merchant
476 8a36f8c5a0
1884 f7c8aaaa65
2100 084c50fa18
2608 81b2d53a2c
2922 7ee333a634
...
12894 6b9acd8e3a
13030 4c367d1317
13719 829a32987b
13936 6ba9d16a6c
14145 7fcbc87b5c
Name: merchant, Length: 67, dtype: object